import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import statsmodels.api as sm
import seaborn as sns

from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn import tree, linear_model 
from sklearn.metrics.pairwise import linear_kernel, sigmoid_kernel, cosine_similarity

%matplotlib inline

/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:7: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:7: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


# Processing meta data
data = []
with open('meta_Luxury_Beauty.json', 'r') as f:
    for l in tqdm(f):
        data.append(json.loads(l))

/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_40221/724658956.py:4: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for l in tqdm(f):

0it [00:00, ?it/s]


df_title = pd.DataFrame(columns = ['asin','title'])
for i in range(len(data)):
    df_title.loc[i,'asin'] = data[i]['asin']
    df_title.loc[i,'title'] = data[i]['title']


len(df_title)

12299


# Read the data frame and drop non-related columns
df = pd.read_csv("Luxury_Beauty.csv",low_memory=False)
df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'style/Size:', 'reviewerName', 'reviewText', 'summary',
       'unixReviewTime', 'vote', 'style/Flavor Name:', 'style/Color:',
       'image/0', 'image/1', 'image/2', 'image/3', 'image/4', 'image/5',
       'image/6', 'image/7', 'image/8', 'style/Format:', 'style/Style Name:',
       'style/Style:', 'style/Scent:', 'style/Package Quantity:',
       'style/Flavor:', 'style/Package Type:', 'style/Scent Name:'],
      dtype='object')


df = df.drop(columns = ['style/Size:','vote','style/Flavor Name:', 'style/Color:',
       'image/0', 'image/1', 'image/2', 'image/3', 'image/4', 'image/5',
       'image/6', 'image/7', 'image/8', 'style/Format:', 'style/Style Name:',
       'style/Style:', 'style/Scent:', 'style/Package Quantity:',
       'style/Flavor:', 'style/Package Type:', 'style/Scent Name:'])
df.head(3)


len(df)

34278


# len(df[df.verified == False]) #16517
# 这里有个column verified不清楚什么意思，但是verified为false的row很多


# Merge two dataframes with the same asin
beauty = pd.merge(df, df_title, on="asin", how="left")
beauty.head(3)


print('Dataset size: {:,} words'.format(len(beauty)))

Dataset size: 35,858 words


# Reformat datetime from raw form
beauty['reviewTime'] = pd.to_datetime(beauty['reviewTime'])


# Rearrange the left-to-right by relevance
beauty = beauty[['asin', 'title', 'summary', 'reviewText', 'overall', 'reviewerID', 'reviewerName', 
                 'reviewTime', 'unixReviewTime']]
beauty.head(3)


beauty['year']=beauty['reviewTime'].dt.year
beauty['month']=beauty['reviewTime'].dt.month
beauty.head(3)


beauty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35858 entries, 0 to 35857
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   asin            35858 non-null  object        
 1   title           35808 non-null  object        
 2   summary         35841 non-null  object        
 3   reviewText      35845 non-null  object        
 4   overall         35858 non-null  int64         
 5   reviewerID      35858 non-null  object        
 6   reviewerName    35858 non-null  object        
 7   reviewTime      35858 non-null  datetime64[ns]
 8   unixReviewTime  35858 non-null  int64         
 9   year            35858 non-null  int64         
 10  month           35858 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(6)
memory usage: 3.3+ MB


beauty = beauty.dropna(subset=['reviewText'])


beauty.asin[pd.isna(beauty.title)].unique()

array(['B0015ZC1FG', 'B0015Z90AA', 'B00172IEVM', 'B001C0W8QG'],
      dtype=object)


#beauty.title[beauty.asin == 'B0015ZC1FG'] = 'Mario Badescu Mario Badescu Mask 2 Oz'
#beauty.title[beauty.asin == 'B0015Z90AA'] = 'Mario Badescu Collagen Moisturizer SPF 15 for Combination & Sensitive Skin| Daytime Face Cream with Collagen & Cottonseed Oil | Softens the Look of Dry Lines | 2 Fl Oz'
#beauty.title[beauty.asin == 'B00172IEVM'] = 'Mario Badescu Chamomile Shampoo'
#beauty.title[beauty.asin == 'B001C0W8QG'] = 'La Roche-Posay Respectissime Liquid Eyeliner, 0.04 Fl oz'


beauty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35845 entries, 0 to 35857
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   asin            35845 non-null  object        
 1   title           35795 non-null  object        
 2   summary         35828 non-null  object        
 3   reviewText      35845 non-null  object        
 4   overall         35845 non-null  int64         
 5   reviewerID      35845 non-null  object        
 6   reviewerName    35845 non-null  object        
 7   reviewTime      35845 non-null  datetime64[ns]
 8   unixReviewTime  35845 non-null  int64         
 9   year            35845 non-null  int64         
 10  month           35845 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(6)
memory usage: 3.3+ MB


len(beauty['asin'].unique())

1581


sum(beauty.duplicated())

5772


beauty = beauty.drop_duplicates()


beauty = beauty.reset_index(drop = True)


beauty['asin'].value_counts().head(20)

B003OGV7UO    694
B004N2S2JM    694
B0006PLMFQ    462
B000J4FGAG    459
B0058TE4WI    458
B007PORYUI    458
B00B59AULY    458
B00699JDKY    457
B00BXS9PFE    439
B00DTH63P2    439
B00014GT8W    307
B002K6AHQY    263
B00H2VO6P0    250
B0013U0EYI    242
B0002ZW5UQ    242
B00M0V39VE    235
B002HG7NX2    174
B000NG80GM    155
B000142FVW    152
B00J66M2SM    152
Name: asin, dtype: int64


print(beauty.title[beauty.asin == 'B003OGV7UO'].unique())
print(beauty.title[beauty.asin == 'B004N2S2JM'].unique())

['Creative Nail Design Shellac UV Color Coat, 25 Ounce']
['Creative Nail Design Shellac UV Color Coat, 25 Ounce']


f, axes = plt.subplots(2,2, figsize=(14,11))
yearly = beauty.groupby(['year'])['reviewerID'].count().reset_index()
yearly = yearly.rename(columns={'reviewerID':'no_of_reviews'})
yearChart = sns.lineplot(x='year',y='no_of_reviews',data=yearly, ax = axes[0,0])
yearChart.set_title('No. of reviews over years')

monthly = beauty.groupby(['month'])['reviewerID'].count().reset_index()
monthly = monthly.rename(columns={'reviewerID':'no_of_reviews'})
monthChart = sns.barplot(x='month',y='no_of_reviews',data=monthly, ax = axes[0,1])
monthChart.set_title('No. of reviews over month')
monthChart.set_xticklabels(monthChart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')

sns.countplot(x = 'overall', data = beauty, ax = axes[1,0] ).set_title('Overall Reviews')

beauty_reviews = beauty.loc[:,['asin','reviewerID','reviewerName','reviewText','summary','overall']]
beauty_reviews['reviewLength'] = beauty_reviews['reviewText'].apply(lambda x: len(x.split()))
reviews_word_length = beauty_reviews.groupby(pd.cut(beauty_reviews.reviewLength, np.arange(0,1000,100))).count()
reviews_word_length = reviews_word_length.rename(columns={'reviewLength':'count'})
reviews_word_length = reviews_word_length.reset_index()
#print(reviews_word_length)
reviewLengthChart = sns.barplot(x='reviewLength',y='count',data=reviews_word_length, ax = axes[1,1])
reviewLengthChart.set_title('Distribution of Reviews by word length')
reviewLengthChart.set_xticklabels(reviewLengthChart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')

f.tight_layout()


from wordcloud import WordCloud
text = " ".join(word for word in beauty['reviewText'].astype(str))
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
wordcloud.generate(text)
wordcloud.to_image()


import re
from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  text = text.lower() #lowercase
  text = re.sub(r'[^\w\s]', '', text) #remove punctuations
  text = re.sub(r'\d+', '', text) #remove numbers
  text = " ".join(text.split()) #stripWhitespace
  text = text.split()
  text = [x for x in text if x not in stop_words] #remove stopwords
  #text = [x for x in text if x not in ['product']] #remove task specific stopwords
  text = " ".join(text)
  # stemmer_ps = PorterStemmer()  
  # text = [stemmer_ps.stem(word) for word in text.split()] #stemming
  # text = " ".join(text)
  # lemmatizer = WordNetLemmatizer()
  # text = [lemmatizer.lemmatize(word) for word in text.split()]  #lemmatization
  # text = " ".join(text)
  return(text)


from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS


beauty['review_processed'] = beauty['reviewText'].apply(lambda x:preprocess(str(x)))
beauty['review_processed'] = beauty['review_processed'].apply(lambda x:x.split())
beauty.head(3)


beauty['pos_neg'] = [1 if x > 3 else 0 for x in beauty.overall]
beauty.head(3)


x_train, x_test, y_train, y_test = train_test_split(beauty.reviewText, beauty.pos_neg, random_state=0)


print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(22554,) (7519,) (22554,) (7519,)


alltext= x_train.tolist()+ x_test.tolist()


vectorizer = CountVectorizer(min_df=5).fit(x_train)
X_train = vectorizer.transform(x_train)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<22554x8773 sparse matrix of type '<class 'numpy.int64'>'
	with 1248545 stored elements in Compressed Sparse Row format>


feature_names = vectorizer.get_feature_names_out()
print("Number of features: {}".format(len(feature_names)))

Number of features: 8773


logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
X_test = vectorizer.transform(x_test)
log_y_pred = logreg.predict(X_test)
logreg_score = accuracy_score(y_test, log_y_pred)
print("Accuracy:   {:.3f}".format(logreg_score))

/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Accuracy:   0.869


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.907
Test set score: 0.869


log_cfm = confusion_matrix(y_test, log_y_pred)
print("Confusion matrix:")
print(log_cfm, end='\n\n')
print('-'*15)
print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))

Confusion matrix:
[[ 744  720]
 [ 268 5787]]

---------------
[['TN' 'FP']
 ['FN' 'TP']]


plt.imshow(log_cfm, interpolation='nearest')

for i, j in itertools.product(range(log_cfm.shape[0]), range(log_cfm.shape[1])):
    plt.text(j, i, log_cfm[i, j],
             horizontalalignment="center",
             color="white")

plt.ylabel('True label (Recall)')
plt.xlabel('Predicted label (Precision)')
plt.title('Logistic Reg | Confusion Matrix')
plt.colorbar();


log_f1 = f1_score(y_test, log_y_pred)
print("Logistic Reg - F1 score: {:.3f}".format(log_f1))

Logistic Reg - F1 score: 0.921


mnb = MultinomialNB(alpha=.01)
mnb.fit(X_train, y_train)

MultinomialNB(alpha=0.01)


mnb_y_pred = mnb.predict(X_test)


mnb_score = accuracy_score(y_test, mnb_y_pred)
print("Accuracy:   {:.3f}".format(mnb_score))

Accuracy:   0.856


print("Training set score: {:.3f}".format(mnb.score(X_train, y_train)))
print("Test set score: {:.3f}".format(mnb.score(X_test, y_test)))

Training set score: 0.895
Test set score: 0.856


mnb_cfm = confusion_matrix(y_test, mnb_y_pred)
print("Confusion matrix:")
print(mnb_cfm, end='\n\n')
print('-'*15)
print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))

Confusion matrix:
[[ 856  608]
 [ 476 5579]]

---------------
[['TN' 'FP']
 ['FN' 'TP']]


plt.imshow(mnb_cfm, interpolation='nearest')

for i, j in itertools.product(range(mnb_cfm.shape[0]), range(mnb_cfm.shape[1])):
    plt.text(j, i, mnb_cfm[i, j],
             horizontalalignment="center",
             color="white")

plt.ylabel('True label (Recall)')
plt.xlabel('Predicted label (Precision)')
plt.title('Multinomial | Confusion Matrix')
plt.colorbar();


mnb_f1 = f1_score(y_test, mnb_y_pred)
print("Multinomial NB - F1 score: {:.3f}".format(mnb_f1))

Multinomial NB - F1 score: 0.911


mnb_tfidfvectorizer = TfidfVectorizer(min_df=5).fit(x_train)
mnb_X_train = mnb_tfidfvectorizer.transform(x_train)
print("X_train:\n{}".format(repr(mnb_X_train)))

X_train:
<22554x8773 sparse matrix of type '<class 'numpy.float64'>'
	with 1248545 stored elements in Compressed Sparse Row format>


mnb_X_test = mnb_tfidfvectorizer.transform(x_test)
mnb_y_pred = mnb.predict(mnb_X_test)


mnb_score2 = accuracy_score(y_test, mnb_y_pred)
print("Accuracy:   {:.3f}".format(mnb_score2))

Accuracy:   0.820


print("Training set score: {:.3f}".format(mnb.score(mnb_X_train, y_train)))
print("Test set score: {:.3f}".format(mnb.score(mnb_X_test, y_test)))

Training set score: 0.835
Test set score: 0.820


mnb_cfm2 = confusion_matrix(y_test, mnb_y_pred)
print("Confusion matrix:")
print(mnb_cfm2, end='\n\n')
print('-'*15)
print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))

Confusion matrix:
[[ 145 1319]
 [  31 6024]]

---------------
[['TN' 'FP']
 ['FN' 'TP']]


plt.imshow(mnb_cfm2, interpolation='nearest')

for i, j in itertools.product(range(mnb_cfm2.shape[0]), range(mnb_cfm2.shape[1])):
    plt.text(j, i, mnb_cfm2[i, j],
             horizontalalignment="center",
             color="white")

plt.ylabel('True label (Recall)')
plt.xlabel('Predicted label (Precision)')
plt.title('Multinomial | Confusion Matrix')
plt.colorbar();


mnb2_f1 = f1_score(y_test, mnb_y_pred)
print("Multinomial NB - F1 score: {:.3f}".format(mnb2_f1))

Multinomial NB - F1 score: 0.899


beauty.head(3)


review_text = beauty['reviewText'].apply(lambda x:preprocess(x))


# Use tf-idf features
from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf = tfidf_vectorizer.fit_transform(review_text)

# Use tf features
tf_vectorizer = CountVectorizer(stop_words=stop_words)
tf = tf_vectorizer.fit_transform(review_text)


tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print("Number of total features: {}".format(len(tfidf_feature_names)))

Number of total features: 33037


# Initialize NMF
nmf = NMF(n_components=10, random_state=1,
          alpha=.1, l1_ratio=.5)

# Initialize Ida
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)


num_top_words = 15

def retrieve_top_words(model, feature_names, num_top_words):
    for idx, topic in enumerate(model.components_):
        print("Topic #{}:".format(idx), end='\n')
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]), end='\n\n')
    print()


nmf_tf = nmf.fit(tf)

/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead
  warnings.warn(
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:289: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1637: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn(


nmf_ = nmf_tf.transform(tf)
Counter([np.argmax(i) for i in nmf_])

/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead
  warnings.warn(

Counter({9: 3063,
         7: 1537,
         0: 3836,
         4: 3396,
         6: 4002,
         3: 2903,
         5: 7413,
         8: 1090,
         1: 390,
         2: 2443})


retrieve_top_words(nmf_tf, tfidf_feature_names, num_top_words)

Topic #0:
skin dry sensitive products oil serum moisturizer oily feels feel does foundation care ingredients using

Topic #1:
dermablend loreal professional spf cover look powder products nice makeup setting product skin light medium

Topic #2:
hair use dry shampoo iron conditioner using just used products long great oil heat hold

Topic #3:
like really dont just im feel doesnt look little ive does didnt think good smell

Topic #4:
product products used price good great does using did apply ingredients try little im results

Topic #5:
color polish love great colors nail coat nails looks look pink coats just nice essie

Topic #6:
use face using just used time dont makeup ive im day really little results did

Topic #7:
cream shave shaving eye products night creams hand using eyes used razor good does hands

Topic #8:
brush foundation makeup powder brushes shaving use bristles coverage shave clarisonic soap good handle blush

Topic #9:
scent smell fragrance bottle nice love perfume strong light body good lotion smells floral does


lda_tf = lda.fit(tf)


lda_ = lda_tf.transform(tf)
Counter([np.argmax(i) for i in lda_])

Counter({8: 14554,
         7: 3987,
         6: 6758,
         4: 4132,
         5: 167,
         3: 54,
         0: 102,
         2: 191,
         9: 93,
         1: 35})


retrieve_top_words(lda_tf, tfidf_feature_names, num_top_words)

Topic #0:
neon candle basic replacement seriously ok wood orange dissipates zero delivery juicy spice josie package

Topic #1:
iron feet curling tanning hot nuface curl cord heat mitt tanner barrel irons curler xentan

Topic #2:
perfume fragrance device wife like classalinknormal datahookproductlinklinked cologne loves likes wear floral bristles scents unique

Topic #3:
lashes dryer japonesque mud button mario tizo features eyelashes badescu settings tobacco tonic setting temperature

Topic #4:
color polish love nail nails coat essie coats beautiful colors pretty like pink time bottle

Topic #5:
brush shave shaving soap razor cream handle lather beard proraso black kit clippers thank close

Topic #6:
color great like foundation look makeup nice powder product really light coverage dark good apply

Topic #7:
hair scent like smell nice great smells spray just bottle love fragrance body pleasant strong

Topic #8:
skin product use face like using really cream used products just does im good dont

Topic #9:
ingredients oil acid extract list butter contains alcohol shea organic loccitane seed sodium fragrance active


nmf_tfidf = nmf.fit(tfidf)

/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead
  warnings.warn(
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:289: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1637: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn(


nmf_W = nmf_tfidf.transform(tfidf)
Counter([np.argmax(i) for i in nmf_W])

/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead
  warnings.warn(

Counter({9: 9065,
         0: 6608,
         2: 1795,
         8: 1233,
         3: 1723,
         1: 2345,
         7: 1310,
         5: 1100,
         6: 2324,
         4: 2570})


retrieve_top_words(nmf_tfidf, tfidf_feature_names, num_top_words)

Topic #0:
skin face cream dry use using sensitive products moisturizer feels feel cleanser feeling oily soft

Topic #1:
color beautiful pretty summer perfect coats pink favorite neutral bright essie looks fall picture awesome

Topic #2:
great works smells stuff summer price shipping color brand neutral fast looks item quality seller

Topic #3:
love products color stuff cnd colors brand essie absolutely amazing compliments shade smell shellac new

Topic #4:
hair shampoo conditioner use iron dry spray fine hold curls makes soft used works scalp

Topic #5:
good price stuff smells coverage pretty quality job works feels does overall looks polish buy

Topic #6:
polish nail essie nails coat colors coats polishes favorite chips opi chip pink best fast

Topic #7:
product excellent years using used described recommend wonderful quality received use price products time results

Topic #8:
nice scent smells smell light color really feels bold little smooth fragrance different stays clean

Topic #9:
like really just little use foundation makeup dont scent look does long bit powder day


lda_tfidf = lda.fit(tfidf)


lda_W = lda_tfidf.transform(tfidf)
Counter([np.argmax(i) for i in lda_W])

Counter({6: 29037,
         5: 91,
         3: 39,
         8: 718,
         0: 49,
         7: 19,
         2: 27,
         4: 38,
         1: 42,
         9: 13})


retrieve_top_words(lda_tfidf, tfidf_feature_names, num_top_words)

Topic #0:
ok charcoal speedy ty lengthens responsive handcream topicals temptu winterlike bronco gulsha aa irrelevant wellso

Topic #1:
tizo extract glycol sodium seed buttery ci glycerin dimethicone cashmere leaf cinnamon phenoxyethanol precious alright

Topic #2:
elta md raspberry perfecto expectedits browngray mystery shown massages paints baroness overthetop rite bizarre untreated

Topic #3:
tobacco vibrant aaa sparkling excelente authenticity thigh inserts depicted doubtful valid granuals neiman offenders behold

Topic #4:
awesome gifting colore dusty coater pans opaqueness allinone channel midnight crystals nephew manis preserving daves

Topic #5:
described item purchase waste promptly exppected money def useful arrived attractively excelent nostrils advertising width

Topic #6:
skin color product like use hair great good love really nice face just does scent

Topic #7:
cute diamond dissipated leans vibration gardening choo social sisters reordered vampire represented lends overdoing whisper

Topic #8:
love great color product watered toothpaste michelle complaints unexpected looks spirit steady stains latte basecoatbut

Topic #9:
underarms eyeliners typehidden videoblockdivinput div classvideourlinput aspacingsmall aspacingtopmini classasection bathe recipient swatched rules crisco omg


tfidf.shape

(30073, 33037)


sig_kern = sigmoid_kernel(tfidf, tfidf)


sig_kern.shape

(30073, 30073)


sig_kern

array([[0.76160687, 0.76159416, 0.76159511, ..., 0.76159416, 0.76159416,
        0.76159416],
       [0.76159416, 0.76160687, 0.76159545, ..., 0.76159416, 0.76159416,
        0.76159416],
       [0.76159511, 0.76159545, 0.76160687, ..., 0.76159416, 0.76159416,
        0.76159416],
       ...,
       [0.76159416, 0.76159416, 0.76159416, ..., 0.76160687, 0.76159416,
        0.76159416],
       [0.76159416, 0.76159416, 0.76159416, ..., 0.76159416, 0.76160687,
        0.76159455],
       [0.76159416, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159455,
        0.76160687]])


index = pd.Series(beauty.index, index=beauty['title']).drop_duplicates()


index

title
Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ        0
Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ        1
Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ        2
Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ        3
Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ        4
                                                                                         ...  
TIZO Photoceutical AM Replenish SPF 40 Sunscreen Primer, 1 fl. oz.                       30068
TIZO Photoceutical AM Replenish SPF 40 Sunscreen Primer, 1 fl. oz.                       30069
ORIBE Bright Blonde Radiance and Repair Treatment, 4.2 fl. oz.                           30070
ORIBE Bright Blonde Radiance and Repair Treatment, 4.2 fl. oz.                           30071
ELEMIS Superfood Facial Oil - Nourishing Face Oil, 0.5 fl. oz.                           30072
Length: 30073, dtype: int64


def recommend_beauty(name, sig_kern=sig_kern):
    indx = index[name]
    sigmoid_score = list(enumerate(sig_kern[indx]))
    sigmoid_score = sorted(sigmoid_score, key = lambda x:x[1], reverse = True)
    sigmoid_score = sigmoid_score[1:4]
    position = [i[0] for i in sigmoid_score]
    return beauty.iloc[position]


your_favorite_beauty_title = "Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ"


beauty[beauty.title == your_favorite_beauty_title]


recommend_beauty(your_favorite_beauty_title)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [145], in <cell line: 1>()
----> 1 recommend_beauty(your_favorite_beauty_title)

Input In [140], in recommend_beauty(name, sig_kern)
      2 indx = index[name]
      3 sigmoid_score = list(enumerate(sig_kern[indx]))
----> 4 sigmoid_score = sorted(sigmoid_score, key = lambda x:x[1], reverse = True)
      5 sigmoid_score = sigmoid_score[1:4]
      6 position = [i[0] for i in sigmoid_score]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


beauty.head(3)


all_nails = beauty.copy()


all_nails['title_processed'] = all_nails['title'].apply(lambda x:preprocess(str(x)))
all_nails['title_processed'] = all_nails['title_processed'].apply(lambda x:x.split())
all_nails.head(3)


ls = []
for i in all_nails.index:
    if "nail" in all_nails.loc[i,'title_processed']:
        ls.append(i)


len(ls)

6809


all_nails = all_nails.iloc[ls,:]


all_nails = all_nails.reset_index(drop = True)


all_nails.head(3)


all_nails.overall.value_counts().sort_index(ascending = False)

5    4673
4     908
3     567
2     323
1     338
Name: overall, dtype: int64


all_nails.to_csv("all_nails.csv")


import re
from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  text = text.lower() #lowercase
  text = re.sub(r'[^\w\s]', '', text) #remove punctuations
  text = re.sub(r'\d+', '', text) #remove numbers
  text = " ".join(text.split()) #stripWhitespace
  text = text.split()
  text = [x for x in text if x not in stop_words] #remove stopwords
  text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
                                      'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
  text = " ".join(text)
  # stemmer_ps = PorterStemmer()  
  # text = [stemmer_ps.stem(word) for word in text.split()] #stemming
  # text = " ".join(text)
  # lemmatizer = WordNetLemmatizer()
  # text = [lemmatizer.lemmatize(word) for word in text.split()]  #lemmatization
  # text = " ".join(text)
  return(text)


all_nails['review_processed']=all_nails['reviewText'].apply(lambda x:preprocess(x))
all_nails['review_processed']=all_nails['review_processed'].apply(lambda x:x.split())


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in all_nails.review_processed[(all_nails.overall == 4)|(all_nails.overall == 5)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in all_nails.review_processed[(all_nails.overall == 1)|(all_nails.overall == 2)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


from gensim.models.word2vec import Word2Vec


model = Word2Vec(sentences=all_nails['review_processed'].tolist(), 
                 size=100, sg=1,
                 min_count=5,window=10,
                 workers=-1,seed=10,iter=250)


vocab = model.wv.index2word
len(vocab)

1939


model.wv.most_similar('disappointed', topn=10)

[('imatation', 0.3182666301727295),
 ('lost', 0.30434471368789673),
 ('doesnt', 0.30145254731178284),
 ('effectively', 0.2999058663845062),
 ('noting', 0.2995549440383911),
 ('needs', 0.2948138117790222),
 ('clearly', 0.28421464562416077),
 ('shoot', 0.279448926448822),
 ('instead', 0.27057692408561707),
 ('cuticles', 0.26807481050491333)]


from gensim import corpora
dictionary = corpora.Dictionary(all_nails['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF


all_nails['review_ids']=all_nails['review_processed'].apply(lambda x:dictionary.doc2bow(x))


from gensim import models
num_topics=7
ldamodel = models.ldamodel.LdaModel(all_nails.review_ids[(all_nails.overall == 4)|(all_nails.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.025*"perfect" + 0.024*"beautiful" + 0.015*"great" + 0.015*"goes" + 0.015*"colors" + 0.014*"time"')
(1, '0.041*"great" + 0.017*"coat" + 0.011*"looks" + 0.009*"coats" + 0.009*"bottle" + 0.009*"used"')
(2, '0.027*"great" + 0.018*"nice" + 0.015*"beautiful" + 0.015*"perfect" + 0.013*"fast" + 0.013*"looks"')
(3, '0.019*"coats" + 0.018*"pretty" + 0.012*"shade" + 0.011*"perfect" + 0.011*"little" + 0.011*"good"')
(4, '0.017*"pink" + 0.016*"coat" + 0.015*"dark" + 0.015*"colors" + 0.015*"looks" + 0.012*"great"')
(5, '0.044*"great" + 0.023*"good" + 0.020*"pink" + 0.015*"summer" + 0.013*"bright" + 0.010*"time"')
(6, '0.028*"pretty" + 0.017*"nice" + 0.015*"coats" + 0.013*"use" + 0.013*"look" + 0.011*"great"')


from gensim import models
num_topics=7
ldamodel = models.ldamodel.LdaModel(all_nails.review_ids[(all_nails.overall == 1)|(all_nails.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.020*"coats" + 0.012*"coat" + 0.011*"disappointed" + 0.011*"quality" + 0.010*"looks" + 0.010*"time"')
(1, '0.013*"colors" + 0.013*"coat" + 0.012*"chips" + 0.012*"bought" + 0.012*"great" + 0.009*"better"')
(2, '0.011*"coat" + 0.010*"picture" + 0.009*"pink" + 0.008*"nice" + 0.008*"u" + 0.008*"chips"')
(3, '0.021*"disappointed" + 0.019*"coats" + 0.010*"wont" + 0.009*"time" + 0.009*"does" + 0.008*"pink"')
(4, '0.018*"pink" + 0.011*"coat" + 0.010*"looking" + 0.009*"disappointment" + 0.009*"money" + 0.008*"coats"')
(5, '0.013*"dont" + 0.012*"doesnt" + 0.011*"looks" + 0.008*"coats" + 0.007*"use" + 0.007*"green"')
(6, '0.019*"coats" + 0.012*"use" + 0.009*"quality" + 0.009*"better" + 0.008*"purchased" + 0.008*"pretty"')


essie = all_nails.copy()


ls = []
for i in essie.index:
    if "essie" in essie.loc[i,'title_processed']:
        ls.append(i)

essie = essie.iloc[ls,:]
essie = essie.reset_index(drop = True)


len(essie)

3641


essie.overall.value_counts().sort_index(ascending = False)

5    2384
4     508
3     320
2     206
1     223
Name: overall, dtype: int64


import re
from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  text = text.lower() #lowercase
  text = re.sub(r'[^\w\s]', '', text) #remove punctuations
  text = re.sub(r'\d+', '', text) #remove numbers
  text = " ".join(text.split()) #stripWhitespace
  text = text.split()
  text = [x for x in text if x not in stop_words] #remove stopwords
  text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
                                      'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
  text = " ".join(text)
  # stemmer_ps = PorterStemmer()  
  # text = [stemmer_ps.stem(word) for word in text.split()] #stemming
  # text = " ".join(text)
  # lemmatizer = WordNetLemmatizer()
  # text = [lemmatizer.lemmatize(word) for word in text.split()]  #lemmatization
  # text = " ".join(text)
  return(text)

essie['review_processed']=essie['reviewText'].apply(lambda x:preprocess(x))
essie['review_processed']=essie['review_processed'].apply(lambda x:x.split())


print(essie.reviewText.unique())

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [1], in <cell line: 1>()
----> 1 print(essie.reviewText.unique())

NameError: name 'essie' is not defined


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in essie.review_processed[(essie.overall == 4)|(essie.overall == 5)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in essie.review_processed[(essie.overall == 1)|(essie.overall == 2)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


from gensim.models.word2vec import Word2Vec

model = Word2Vec(sentences=essie['review_processed'].tolist(), 
                 size=100, sg=1,
                 min_count=5,window=10,
                 workers=-1,seed=10,iter=250)

vocab = model.wv.index2word


model.wv.most_similar('disappointed', topn=10)

[('imatation', 0.3182666301727295),
 ('doesnt', 0.30145254731178284),
 ('effectively', 0.29990583658218384),
 ('needs', 0.2948138117790222),
 ('instead', 0.27057692408561707),
 ('iridescent', 0.2598556876182556),
 ('berry', 0.2544700801372528),
 ('lovers', 0.25341448187828064),
 ('fitting', 0.2525997757911682),
 ('seche', 0.24726007878780365)]


from gensim import corpora
dictionary = corpora.Dictionary(essie['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF


essie['review_ids']=essie['review_processed'].apply(lambda x:dictionary.doc2bow(x))


from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(essie.review_ids[(essie.overall == 4)|(essie.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.053*"great" + 0.016*"coats" + 0.015*"summer" + 0.012*"coat" + 0.012*"time" + 0.011*"favorite"')
(1, '0.032*"perfect" + 0.031*"pink" + 0.028*"great" + 0.022*"beautiful" + 0.019*"summer" + 0.015*"time"')
(2, '0.024*"nice" + 0.019*"perfect" + 0.018*"good" + 0.016*"beautiful" + 0.014*"time" + 0.014*"colors"')
(3, '0.033*"pretty" + 0.018*"pink" + 0.018*"favorite" + 0.017*"coats" + 0.017*"great" + 0.015*"colors"')
(4, '0.037*"great" + 0.020*"looks" + 0.013*"coat" + 0.010*"skin" + 0.010*"neon" + 0.010*"beautiful"')


from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(essie.review_ids[(essie.overall == 1)|(essie.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.030*"coat" + 0.028*"coats" + 0.022*"pink" + 0.016*"u" + 0.015*"does" + 0.012*"streaky"')
(1, '0.021*"coats" + 0.019*"doesnt" + 0.014*"matte" + 0.014*"bought" + 0.013*"colors" + 0.013*"looks"')
(2, '0.022*"colors" + 0.019*"gray" + 0.015*"dries" + 0.015*"looks" + 0.014*"shown" + 0.013*"pretty"')
(3, '0.018*"quality" + 0.015*"looks" + 0.015*"coats" + 0.012*"bought" + 0.011*"amazon" + 0.011*"dont"')
(4, '0.037*"disappointed" + 0.034*"coats" + 0.013*"good" + 0.012*"formula" + 0.012*"able" + 0.012*"applied"')


opi = all_nails.copy()

ls = []
for i in opi.index:
    if "opi" in opi.loc[i,'title_processed']:
        ls.append(i)

opi = opi.iloc[ls,:]
opi = opi.reset_index(drop = True)


len(opi)

678


opi.overall.value_counts().sort_index(ascending = False)

5    492
4     82
3     53
2     23
1     28
Name: overall, dtype: int64


import re
from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  text = text.lower() #lowercase
  text = re.sub(r'[^\w\s]', '', text) #remove punctuations
  text = re.sub(r'\d+', '', text) #remove numbers
  text = " ".join(text.split()) #stripWhitespace
  text = text.split()
  text = [x for x in text if x not in stop_words] #remove stopwords
  text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
                                      'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
  text = " ".join(text)
  # stemmer_ps = PorterStemmer()  
  # text = [stemmer_ps.stem(word) for word in text.split()] #stemming
  # text = " ".join(text)
  # lemmatizer = WordNetLemmatizer()
  # text = [lemmatizer.lemmatize(word) for word in text.split()]  #lemmatization
  # text = " ".join(text)
  return(text)

opi['review_processed']=opi['reviewText'].apply(lambda x:preprocess(x))
opi['review_processed']=opi['review_processed'].apply(lambda x:x.split())


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in opi.review_processed[(opi.overall == 4)|(opi.overall == 5)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in opi.review_processed[(opi.overall == 1)|(opi.overall == 2)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


from gensim.models.word2vec import Word2Vec

model = Word2Vec(sentences=opi['review_processed'].tolist(), 
                 size=100, sg=1,
                 min_count=5,window=10,
                 workers=-1,seed=10,iter=250)

vocab = model.wv.index2word


model.wv.most_similar('good', topn=10)

[('gave', 0.3487659990787506),
 ('pinterst', 0.2854136824607849),
 ('secure', 0.28510773181915283),
 ('medium', 0.2783038020133972),
 ('loved', 0.2766246199607849),
 ('demure', 0.26437440514564514),
 ('collections', 0.2505810558795929),
 ('plum', 0.24891437590122223),
 ('needed', 0.2451680302619934),
 ('stain', 0.2412152886390686)]


from gensim import corpora
dictionary = corpora.Dictionary(opi['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF


opi['review_ids']=opi['review_processed'].apply(lambda x:dictionary.doc2bow(x))


from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(opi.review_ids[(opi.overall == 4)|(opi.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.019*"great" + 0.017*"beautiful" + 0.012*"perfect" + 0.011*"coat" + 0.010*"coats" + 0.007*"gorgeous"')
(1, '0.013*"coat" + 0.012*"great" + 0.012*"use" + 0.009*"base" + 0.009*"colors" + 0.009*"looks"')
(2, '0.021*"coat" + 0.013*"base" + 0.011*"beautiful" + 0.009*"nice" + 0.007*"used" + 0.007*"using"')
(3, '0.015*"use" + 0.011*"pink" + 0.010*"remover" + 0.010*"great" + 0.010*"coat" + 0.009*"works"')
(4, '0.034*"great" + 0.015*"little" + 0.014*"good" + 0.011*"coat" + 0.010*"use" + 0.009*"pretty"')


from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(opi.review_ids[(opi.overall == 1)|(opi.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.011*"gel" + 0.009*"coat" + 0.009*"day" + 0.008*"stick" + 0.007*"stuff" + 0.007*"brush"')
(1, '0.007*"day" + 0.007*"dont" + 0.006*"nude" + 0.006*"waste" + 0.006*"money" + 0.006*"use"')
(2, '0.012*"coat" + 0.007*"mess" + 0.006*"time" + 0.006*"dirty" + 0.005*"hands" + 0.005*"sure"')
(3, '0.011*"yellow" + 0.010*"look" + 0.006*"didnt" + 0.006*"pink" + 0.005*"white" + 0.005*"time"')
(4, '0.008*"coat" + 0.008*"dont" + 0.007*"use" + 0.006*"brand" + 0.006*"didnt" + 0.005*"disappointed"')


cnd_ls = []
for i in beauty.index:
    if "nail" in str(beauty.iloc[i]["title"]).lower() or "nails" in str(beauty.iloc[i]["title"]).lower():
        if "cnd" in str(beauty.iloc[i]["title"]).lower() or "creative nail" in str(beauty.iloc[i]["title"]).lower():
            cnd_ls.append(i)


cnd= beauty.iloc[cnd_ls,:]


len(cnd)

1838


cnd.overall.value_counts().sort_index(ascending = False)

5    1393
4     220
3     121
2      60
1      44
Name: overall, dtype: int64


cnd.head(3)


import re
from sklearn import feature_extraction 
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def preprocess(text):
  text = text.lower() #lowercase
  text = re.sub(r'[^\w\s]', '', text) #remove punctuations
  text = re.sub(r'\d+', '', text) #remove numbers
  text = " ".join(text.split()) #stripWhitespace
  text = text.split()
  text = [x for x in text if x not in stop_words] #remove stopwords
  text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
                                      'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
  text = " ".join(text)
  # stemmer_ps = PorterStemmer()  
  # text = [stemmer_ps.stem(word) for word in text.split()] #stemming
  # text = " ".join(text)
  # lemmatizer = WordNetLemmatizer()
  # text = [lemmatizer.lemmatize(word) for word in text.split()]  #lemmatization
  # text = " ".join(text)
  return(text)

cnd['review_processed']=cnd['reviewText'].apply(lambda x:preprocess(x))
cnd['review_processed']=cnd['review_processed'].apply(lambda x:x.split())

/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_38449/2805646083.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnd['review_processed']=cnd['reviewText'].apply(lambda x:preprocess(x))
/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_38449/2805646083.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnd['review_processed']=cnd['review_processed'].apply(lambda x:x.split())


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in cnd.review_processed[(cnd.overall == 4)|(cnd.overall == 5)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
text = " ".join(word for word in cnd.review_processed[(cnd.overall == 1)|(cnd.overall == 2)].astype(str))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(text)

# Visualize the word cloud
wordcloud.to_image()


from gensim.models.word2vec import Word2Vec

model = Word2Vec(sentences=cnd['review_processed'].tolist(), 
                 size=100, sg=1,
                 min_count=5,window=10,
                 workers=-1,seed=10,iter=250)

vocab = model.wv.index2word


model.wv.most_similar('disappointed', topn=10)

[('doesnt', 0.30145254731178284),
 ('needs', 0.2948138117790222),
 ('clearly', 0.28421464562416077),
 ('instead', 0.27057692408561707),
 ('cuticles', 0.26807481050491333),
 ('mail', 0.2597600817680359),
 ('st', 0.25243714451789856),
 ('seller', 0.24061936140060425),
 ('easily', 0.23203958570957184),
 ('peel', 0.23144644498825073)]


from gensim import corpora
dictionary = corpora.Dictionary(cnd['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF


cnd['review_ids']=cnd['review_processed'].apply(lambda x:dictionary.doc2bow(x))

/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_38449/120259428.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnd['review_ids']=cnd['review_processed'].apply(lambda x:dictionary.doc2bow(x))


from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(cnd.review_ids[(cnd.overall == 4)|(cnd.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.019*"pretty" + 0.017*"shellac" + 0.015*"good" + 0.015*"great" + 0.011*"use" + 0.011*"coats"')
(1, '0.032*"great" + 0.017*"shellac" + 0.014*"colors" + 0.012*"weeks" + 0.011*"light" + 0.011*"use"')
(2, '0.025*"great" + 0.022*"shellac" + 0.012*"light" + 0.011*"wear" + 0.011*"lasts" + 0.010*"week"')
(3, '0.021*"coat" + 0.012*"colors" + 0.012*"shellac" + 0.011*"nice" + 0.009*"coats" + 0.009*"use"')
(4, '0.035*"great" + 0.028*"shellac" + 0.017*"colors" + 0.013*"red" + 0.012*"dark" + 0.011*"look"')


from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(cnd.review_ids[(cnd.overall == 1)|(cnd.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

(0, '0.009*"shellac" + 0.008*"good" + 0.007*"use" + 0.006*"looking" + 0.006*"pretty" + 0.006*"finish"')
(1, '0.012*"coats" + 0.009*"shellac" + 0.006*"colors" + 0.006*"days" + 0.006*"pink" + 0.006*"use"')
(2, '0.012*"light" + 0.006*"idea" + 0.005*"bad" + 0.005*"pretty" + 0.005*"manicure" + 0.005*"way"')
(3, '0.015*"pink" + 0.010*"money" + 0.009*"return" + 0.009*"did" + 0.008*"brown" + 0.008*"dont"')
(4, '0.008*"glitter" + 0.007*"peels" + 0.007*"easily" + 0.007*"shellac" + 0.006*"colors" + 0.006*"pink"')

	overall	verified	reviewTime	reviewerID	asin	reviewerName	reviewText	summary	unixReviewTime
0	5	True	01 5, 2018	A2HOI48JK8838M	B00004U9V2	DB	This handcream has a beautiful fragrance. It d...	Beautiful Fragrance	1515110400
1	5	True	04 5, 2017	A1YIPEY7HX73S7	B00004U9V2	Ajaey	wonderful hand lotion, for seriously dry skin,...	wonderful hand lotion	1491350400
2	5	True	03 27, 2017	A2QCGHIJ2TCLVP	B00004U9V2	D. Jones	Best hand cream around. Silky, thick, soaks i...	Best hand cream around	1490572800

	overall	verified	reviewTime	reviewerID	asin	reviewerName	reviewText	summary	unixReviewTime	title
0	5	True	01 5, 2018	A2HOI48JK8838M	B00004U9V2	DB	This handcream has a beautiful fragrance. It d...	Beautiful Fragrance	1515110400	Crabtree & Evelyn - Gardener's Ultra-Moist...
1	5	True	01 5, 2018	A2HOI48JK8838M	B00004U9V2	DB	This handcream has a beautiful fragrance. It d...	Beautiful Fragrance	1515110400	Crabtree & Evelyn - Gardener's Ultra-Moist...
2	5	True	04 5, 2017	A1YIPEY7HX73S7	B00004U9V2	Ajaey	wonderful hand lotion, for seriously dry skin,...	wonderful hand lotion	1491350400	Crabtree & Evelyn - Gardener's Ultra-Moist...

	asin	title	summary	reviewText	overall	reviewerID	reviewerName	reviewTime	unixReviewTime
0	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	Beautiful Fragrance	This handcream has a beautiful fragrance. It d...	5	A2HOI48JK8838M	DB	2018-01-05	1515110400
1	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	Beautiful Fragrance	This handcream has a beautiful fragrance. It d...	5	A2HOI48JK8838M	DB	2018-01-05	1515110400
2	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	wonderful hand lotion	wonderful hand lotion, for seriously dry skin,...	5	A1YIPEY7HX73S7	Ajaey	2017-04-05	1491350400

	asin	title	summary	reviewText	overall	reviewerID	reviewerName	reviewTime	unixReviewTime	year	month
0	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	Beautiful Fragrance	This handcream has a beautiful fragrance. It d...	5	A2HOI48JK8838M	DB	2018-01-05	1515110400	2018	1
1	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	Beautiful Fragrance	This handcream has a beautiful fragrance. It d...	5	A2HOI48JK8838M	DB	2018-01-05	1515110400	2018	1
2	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	wonderful hand lotion	wonderful hand lotion, for seriously dry skin,...	5	A1YIPEY7HX73S7	Ajaey	2017-04-05	1491350400	2017	4

	asin	title	summary	reviewText	overall	reviewerID	reviewerName	reviewTime	unixReviewTime	year	month	review_processed
0	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	Beautiful Fragrance	This handcream has a beautiful fragrance. It d...	5	A2HOI48JK8838M	DB	2018-01-05	1515110400	2018	1	[handcream, beautiful, fragrance, doesnt, stay...
1	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	wonderful hand lotion	wonderful hand lotion, for seriously dry skin,...	5	A1YIPEY7HX73S7	Ajaey	2017-04-05	1491350400	2017	4	[wonderful, hand, lotion, seriously, dry, skin...
2	B00004U9V2	Crabtree & Evelyn - Gardener's Ultra-Moist...	Best hand cream around	Best hand cream around. Silky, thick, soaks i...	5	A2QCGHIJ2TCLVP	D. Jones	2017-03-27	1490572800	2017	3	[best, hand, cream, silky, soaks, way, leaving...

	asin	title	summary	reviewText	overall	reviewerID	reviewerName	reviewTime	unixReviewTime	year	month	review_processed	pos_neg	title_processed
0	B000142FVW	OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0...	Five Stars	Love this color and brand	5	AJ9VB45YRX8C9	Amazon Customer	2018-04-11	1523404800	2018	4	[love, color, brand]	1	[opi, nail, lacquer, boraboraing, pink, fl, oz]
1	B000142FVW	OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0...	Gifted	Was bought for a gift. Recipient was thrilled.	5	A39ECS1S0CJ0X9	Amy Z.	2018-04-03	1522713600	2018	4	[bought, gift, recipient, thrilled]	1	[opi, nail, lacquer, boraboraing, pink, fl, oz]
2	B000142FVW	OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0...	Five Stars	looks great	5	AI2YYV9D3LS8P	breezy deaton	2018-03-18	1521331200	2018	3	[looks, great]	1	[opi, nail, lacquer, boraboraing, pink, fl, oz]

	id	word
0	0	brand
1	1	bought
2	2	gift
3	3	recipient
4	4	thrilled
...	...	...
5077	5077	germany
5078	5078	grooves
5079	5079	maraschino
5080	5080	traveling
5081	5081	wider

	id	word
0	0	amazing
1	1	awesome
2	2	look
3	3	neon
4	4	perfect
...	...	...
1326	1326	running
1327	1327	cartel
1328	1328	visible
1329	1329	priced
1330	1330	reasonably

	asin	title	summary	reviewText	overall	reviewerID	reviewerName	reviewTime	unixReviewTime	year	month	review_processed	pos_neg
6889	B0016LTZD0	Creative Nail SolarOil, 4 Fluid Ounce	Five Stars	way cheaper to buy the refill! works magic on...	5	A22PHKFFVGZ9X6	lisa_north atlanta	2015-08-30	1440892800	2015	8	[way, cheaper, buy, refill, works, magic, beat...	1
9982	B002K6AHQY	CND Vinylux Weekly Nail Polish, Rock Royalty,...	Lovely!	This is CND's new product. The product itself...	4	A2FW71YE37Q2YO	JPS	2013-05-31	1369958400	2013	5	[cnds, new, product, product, awesome, color, ...	1
9983	B002K6AHQY	CND Vinylux Weekly Nail Polish, Rock Royalty,...	Nice color	This is CND's new product. The product itself...	4	A2FW71YE37Q2YO	JPS	2013-05-31	1369958400	2013	5	[cnds, new, product, product, awesome, color, ...	1

	id	word
0	0	beat
1	1	buy
2	2	cheaper
3	3	hands
4	4	magic
...	...	...
2529	2529	werent
2530	2530	editional
2531	2531	sparknlings
2532	2532	messing
2533	2533	meh

	id	word
0	0	brand
1	1	bought
2	2	gift
3	3	recipient
4	4	thrilled
...	...	...
1990	1990	knockwurst
1991	1991	pinky
1992	1992	taupe
1993	1993	dressy
1994	1994	garish

Final Team Project - Essie Market Research Report¶

Loading Data¶

Missing Values¶

Duplicated Values¶

Explore the dataset¶

Top 20 Reviewed Products¶

Overall Ratings Distribution¶

Exploratory Analysis¶

Data Processing¶

Insert pos_neg column for Sentiment modeling¶

Train/Test Split¶

Logistic Regression - Winner¶

CountVectorizer¶

Preformance¶

Multinomial Bayes¶

CountVectorizer¶

TfidfVectorizer¶

Clustering / Topic Modeling (NMF and Lda)¶

CountVectorizer & Tf-idf¶

Build Clustering Models (NMF & LDA)¶

Tf (NMF)¶

Tf (LDA)¶

Tfidf (NMF)¶

Tfidf (LDA)¶

Recommendation (unfinished)¶

All_Nails¶

Filter¶

Positive¶

Negative¶

Word2Vec¶

LDA¶

Positive¶

Negative¶

Essie¶

Filter¶

Positive¶

Negative¶

Word2Vec¶

LDA¶

Positive¶

Negative¶

OPI¶

Filter¶

Positive¶

Negative¶

Word2Vec¶

LDA¶

CND¶

Filter¶

Word2Vec¶

LDA¶